<!DOCTYPE HTML>
<html lang="zh-CN">


<head>
    <meta charset="utf-8">
    <meta name="keywords" content="Spark机器学习笔记, java,nodejs">
    <!-- Single-line description: the generated excerpt contained raw line breaks and stopped mid-sentence. -->
    <meta name="description" content="机器学习是数据通过算法构建出模型并对模型进行评估，评估的性能如果达到要求就拿这个模型来测试其他数据，如果达不到要求就要调整算法来重新建立模型，再次进行评估，如此循环反复，最终获得满意的经验来处理其他数据。">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <!-- user-scalable=no was dropped so that visitors can still pinch-zoom the page. -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="renderer" content="webkit|ie-stand|ie-comp">
    <meta name="mobile-web-app-capable" content="yes">
    <meta name="format-detection" content="telephone=no">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
    <title>Spark机器学习笔记 | libo 工作学习记录</title>
    <link rel="icon" type="image/png" href="/favicon.png">

    <!-- Third-party library styles first, theme and site overrides last. -->
    <link rel="stylesheet" type="text/css" href="/libs/awesome/css/all.css">
    <link rel="stylesheet" type="text/css" href="/libs/materialize/materialize.min.css">
    <link rel="stylesheet" type="text/css" href="/libs/aos/aos.css">
    <link rel="stylesheet" type="text/css" href="/libs/animate/animate.min.css">
    <link rel="stylesheet" type="text/css" href="/libs/lightGallery/css/lightgallery.min.css">
    <link rel="stylesheet" type="text/css" href="/libs/highlight/styles/monokai.css">
    <link rel="stylesheet" type="text/css" href="/css/matery.css">
    <link rel="stylesheet" type="text/css" href="/css/my.css">

    <!-- jQuery is loaded synchronously; NOTE(review): presumably later theme scripts depend on it — confirm before adding defer. -->
    <script src="/libs/jquery/jquery.min.js"></script>
    <script data-ad-client="ca-pub-6820955780229803" async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
<meta name="generator" content="Hexo 5.2.0"></head>


<body>
    <!-- Site header: brand link, desktop menu, search trigger and the slide-out mobile menu. -->
    <header class="navbar-fixed">
        <nav id="headNav" class="bg-color nav-transparent">
            <div id="navContainer" class="nav-wrapper head-container">
                <div class="brand-logo">
                    <a href="/" class="waves-effect waves-light">
                        <!-- Empty alt: the site name right after the logo already names this link. -->
                        <img src="/medias/logo.png" class="logo-img" alt="">
                        <span class="logo-span">libo 工作学习记录</span>
                    </a>
                </div>

                <!-- Icon-only trigger for the mobile menu; aria-label gives it an accessible name. -->
                <a href="#" data-target="mobile-nav" class="sidenav-trigger button-collapse" aria-label="打开导航菜单"><i class="fas fa-bars" aria-hidden="true"></i></a>

                <!-- Desktop menu: every entry except search carries hide-on-med-and-down. -->
                <ul class="right nav-menu">
                    <li class="hide-on-med-and-down nav-item">
                        <a href="/" class="waves-effect waves-light">
                            <i class="fas fa-home" style="zoom: 0.6;" aria-hidden="true"></i>
                            <span>首页</span>
                        </a>
                    </li>
                    <li class="hide-on-med-and-down nav-item">
                        <a href="/tags" class="waves-effect waves-light">
                            <i class="fas fa-tags" style="zoom: 0.6;" aria-hidden="true"></i>
                            <span>标签</span>
                        </a>
                    </li>
                    <li class="hide-on-med-and-down nav-item">
                        <a href="/categories" class="waves-effect waves-light">
                            <i class="fas fa-bookmark" style="zoom: 0.6;" aria-hidden="true"></i>
                            <span>分类</span>
                        </a>
                    </li>
                    <li class="hide-on-med-and-down nav-item">
                        <a href="/archives" class="waves-effect waves-light">
                            <i class="fas fa-archive" style="zoom: 0.6;" aria-hidden="true"></i>
                            <span>归档</span>
                        </a>
                    </li>
                    <li class="hide-on-med-and-down nav-item">
                        <a href="/about" class="waves-effect waves-light">
                            <i class="fas fa-user-circle" style="zoom: 0.6;" aria-hidden="true"></i>
                            <span>关于</span>
                        </a>
                    </li>
                    <li class="hide-on-med-and-down nav-item">
                        <a href="/contact" class="waves-effect waves-light">
                            <i class="fas fa-comments" style="zoom: 0.6;" aria-hidden="true"></i>
                            <span>留言板</span>
                        </a>
                    </li>
                    <li class="hide-on-med-and-down nav-item">
                        <a href="/friends" class="waves-effect waves-light">
                            <i class="fas fa-address-book" style="zoom: 0.6;" aria-hidden="true"></i>
                            <span>友情链接</span>
                        </a>
                    </li>
                    <li>
                        <!-- Icon-only search link; aria-label supplies the name the title attribute cannot reliably provide. -->
                        <a href="#searchModal" class="modal-trigger waves-effect waves-light" aria-label="搜索">
                            <i id="searchIcon" class="fas fa-search" title="搜索" style="zoom: 0.85;" aria-hidden="true"></i>
                        </a>
                    </li>
                </ul>

                <!-- Mobile menu, targeted by the sidenav-trigger above through data-target="mobile-nav". -->
                <div id="mobile-nav" class="side-nav sidenav">
                    <div class="mobile-head bg-color">
                        <!-- Decorative here: the site name is printed directly below. -->
                        <img src="/medias/logo.png" class="logo-img circle responsive-img" alt="">
                        <div class="logo-name">libo 工作学习记录</div>
                        <div class="logo-desc">
                            后台开发、编程、学习记录
                        </div>
                    </div>

                    <ul class="menu-list mobile-menu-list">
                        <li class="m-nav-item">
                            <a href="/" class="waves-effect waves-light">
                                <i class="fa-fw fas fa-home" aria-hidden="true"></i>
                                首页
                            </a>
                        </li>
                        <li class="m-nav-item">
                            <a href="/tags" class="waves-effect waves-light">
                                <i class="fa-fw fas fa-tags" aria-hidden="true"></i>
                                标签
                            </a>
                        </li>
                        <li class="m-nav-item">
                            <a href="/categories" class="waves-effect waves-light">
                                <i class="fa-fw fas fa-bookmark" aria-hidden="true"></i>
                                分类
                            </a>
                        </li>
                        <li class="m-nav-item">
                            <a href="/archives" class="waves-effect waves-light">
                                <i class="fa-fw fas fa-archive" aria-hidden="true"></i>
                                归档
                            </a>
                        </li>
                        <li class="m-nav-item">
                            <a href="/about" class="waves-effect waves-light">
                                <i class="fa-fw fas fa-user-circle" aria-hidden="true"></i>
                                关于
                            </a>
                        </li>
                        <li class="m-nav-item">
                            <!-- Label now matches the desktop menu entry for the same /contact link. -->
                            <a href="/contact" class="waves-effect waves-light">
                                <i class="fa-fw fas fa-comments" aria-hidden="true"></i>
                                留言板
                            </a>
                        </li>
                        <li class="m-nav-item">
                            <a href="/friends" class="waves-effect waves-light">
                                <i class="fa-fw fas fa-address-book" aria-hidden="true"></i>
                                友情链接
                            </a>
                        </li>
                    </ul>
                </div>
            </div>
        </nav>
    </header>

    



<!-- Post banner: the cover image is applied as a CSS background and the article title is rendered as an h1 on top of it. -->
<div class="bg-cover pd-header post-cover" style="background-image: url('/medias/featureimages/15.jpg')">
    <div class="container" style="right: 0px;left: 0px;">
        <div class="row">
            <div class="col s12 m12 l12">
                <div class="brand">
                    <h1 class="description center-align post-title">Spark机器学习笔记</h1>
                </div>
            </div>
        </div>
    </div>
</div>




<main class="post-container content">

    
    <link rel="stylesheet" href="/libs/tocbot/tocbot.css">
<style>
    /*
     * Invisible 100px spacer in front of every article heading. The negative
     * margin cancels its height in the layout, so only the anchor jump target
     * moves up (presumably to clear the fixed navbar).
     */
    #articleContent h1::before,
    #articleContent h2::before,
    #articleContent h3::before,
    #articleContent h4::before,
    #articleContent h5::before,
    #articleContent h6::before {
        display: block;
        content: " ";
        height: 100px;
        margin-top: -100px;
        visibility: hidden;
    }

    /*
     * Hide the focus ring only for focus that is not keyboard-driven.
     * The previous rule (`:focus { outline: none }`) also removed it for
     * keyboard users, leaving them with no visible focus indicator.
     */
    #articleContent :focus:not(:focus-visible) {
        outline: none;
    }

    /* Table-of-contents widget */
    .toc-fixed {
        position: fixed;
        top: 64px;
    }

    .toc-widget {
        width: 345px;
        padding-left: 20px;
    }

    .toc-widget .toc-title {
        margin: 35px 0 15px 0;
        padding-left: 17px;
        font-size: 1.5rem;
        font-weight: bold;
        line-height: 1.5rem;
    }

    .toc-widget ol {
        padding: 0;
        list-style: none;
    }

    /* The TOC list scrolls on its own once it is taller than the viewport minus 250px. */
    #toc-content {
        height: calc(100vh - 250px);
        overflow: auto;
    }

    #toc-content ol {
        padding-left: 10px;
    }

    #toc-content ol li {
        padding-left: 10px;
    }

    #toc-content .toc-link:hover {
        color: #42b983;
        font-weight: 700;
        text-decoration: underline;
    }

    #toc-content .toc-link::before {
        background-color: transparent;
        max-height: 25px;
    }

    /* Active entry: green text plus a green marker bar. */
    #toc-content .is-active-link {
        color: #42b983;
    }

    #toc-content .is-active-link::before {
        background-color: #42b983;
    }

    /* Floating button pinned to the bottom-right corner. */
    #floating-toc-btn {
        position: fixed;
        right: 15px;
        bottom: 76px;
        padding-top: 15px;
        margin-bottom: 0;
        z-index: 998;
    }

    #floating-toc-btn .btn-floating {
        width: 48px;
        height: 48px;
    }

    #floating-toc-btn .btn-floating i {
        line-height: 48px;
        font-size: 1.4rem;
    }
</style>
<div class="row">
    <div id="main-content" class="col s12 m12 l9">
        <!-- 文章内容详情 -->
<div id="artDetail">
    <div class="card">
        <div class="card-content article-info">
            <div class="row tag-cate">
                <div class="col s7">
                    
                    <div class="article-tag">
                        
                            <a href="/tags/spark/">
                                <span class="chip bg-color">spark</span>
                            </a>
                        
                            <a href="/tags/ml/">
                                <span class="chip bg-color">ml</span>
                            </a>
                        
                    </div>
                    
                </div>
                <div class="col s5 right-align">
                    
                </div>
            </div>

            <!-- Post metadata: publish/update dates, word count, reading time and page views. -->
            <div class="post-info">
                <div class="post-date info-break-policy">
                    <i class="far fa-calendar-minus fa-fw" aria-hidden="true"></i>发布日期:&nbsp;&nbsp;
                    <!-- <time> makes the date machine-readable without changing the visible text. -->
                    <time datetime="2019-01-13">2019-01-13</time>
                </div>

                <div class="post-date info-break-policy">
                    <i class="far fa-calendar-check fa-fw" aria-hidden="true"></i>更新日期:&nbsp;&nbsp;
                    <time datetime="2019-01-13">2019-01-13</time>
                </div>

                <div class="info-break-policy">
                    <i class="far fa-file-word fa-fw" aria-hidden="true"></i>文章字数:&nbsp;&nbsp;
                    5.1k
                </div>

                <div class="info-break-policy">
                    <i class="far fa-clock fa-fw" aria-hidden="true"></i>阅读时长:&nbsp;&nbsp;
                    21 分
                </div>

                <!-- The value span is empty in the markup; NOTE(review): presumably filled in at runtime by the busuanzi counter script — confirm it is loaded later in the page. -->
                <div id="busuanzi_container_page_pv" class="info-break-policy">
                    <i class="far fa-eye fa-fw" aria-hidden="true"></i>阅读次数:&nbsp;&nbsp;
                    <span id="busuanzi_value_page_pv"></span>
                </div>
            </div>
            
        </div>
        <hr class="clearfix">
        <div class="card-content article-card-content">
            <div id="articleContent">
                <h2 id="机器学习"><a href="#机器学习" class="headerlink" title="机器学习"></a>机器学习</h2><p>机器学习是数据通过算法构建出模型并对模型进行评估，评估的性能如果达到要求就拿这个模型来测试其他数据，如果达不到要求就要调整算法来重新建立模型，再次进行评估，如此循环反复，最终获得满意的经验来处理其他数据。</p>
<a id="more"></a>

<ol>
<li>监督学习</li>
</ol>
<p>监督学习是从给定的训练数据集中学习一个函数(模型)，当新的数据到来时，可以根据这个函数预测结果。</p>
<h2 id="官方文档地址"><a href="#官方文档地址" class="headerlink" title="官方文档地址"></a><a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/ml-guide.html">官方文档地址</a></h2><p>MLlib 是Spark 机器学习库。它的目标是使实际的机器学习规模化，简单化。简单讲，它提供了如下工具：</p>
<ul>
<li>机器学习算法(ML Algorithms)：常用的机器学习算法如 分类(classification), 回归(regression), 聚类(clustering), 协同过滤(collaborative filtering)</li>
<li>特征化(Featurization): 特征提取，转换，降维，选择</li>
<li>流水线(Pipelines): 构造，评估，调优(tuning) 机器学习管道(ML Pipelines).</li>
<li>持久化(Persistence): 保存和载入算法，模型，和流水线(Pipelines).</li>
<li>工具：线性代数，统计，数据处理</li>
</ul>
<h2 id="推荐系统-Recommender-system"><a href="#推荐系统-Recommender-system" class="headerlink" title="推荐系统(Recommender system)"></a><a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering">推荐系统(Recommender system)</a></h2><p>推荐系统典型的通过以下两种方式产生推荐列表：通过<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Collaborative_filtering">协同过滤</a>或者通过<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Content-based_filtering">基于内容的过滤</a>(基于个性方法)。<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Collaborative_filtering">协同过滤</a>从用户过去的行为和其他用户的相同决定构建模型。这个模型用于预测物品或者用户感兴趣的。<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Content-based_filtering">基于内容的过滤</a>方法利用一系列离散的物品特点来推荐另外拥有相同特点的物品。这些方法通常结合为<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Recommender_system#Hybrid_recommender_systems">混合推荐系统(Hybrid Recommender Systems)</a>.</p>
<h2 id="协同滤波-Collaborative-filter"><a href="#协同滤波-Collaborative-filter" class="headerlink" title="协同滤波(Collaborative filter)"></a>协同滤波(Collaborative filter)</h2><p>设计一个推荐系统广泛使用的一个方法是协同滤波。协同滤波方法基于收集和分析大量用户的行为，活动，喜好，基于与其他用户的相似度预测用户的喜好。协同滤波的关键优点是它不依赖于机器分析内容，因此，它有能力精确推荐复杂的物品例如电影而不需要去了解物品本身。在推荐系统中很多算法用来测量用户相似度或者物品相似度。例如，k-nearest neighbor (k-NN)方法和 Allen首先实现的<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Pearson_correlation">皮尔森相关(Pearson Correlation)</a>.</p>
<p>协同滤波是基于人们过去赞同的未来也会赞同的假设，并且他们会喜欢与他们过去喜欢的物品相似的物品。</p>
<p>当从用户行为构建模型时，通常要区分显式的数据收集和隐式的数据收集。</p>
<p>协同滤波著名的例子之一是 item-item的协同滤波， Amazon.com 推荐系统的流行推荐算法。</p>
<p>协同滤波算法一个典型类型是使用<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Matrix_factorization_(recommender_systems)">矩阵分解</a>，一个<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Low_rank_approximation">低秩矩阵近似</a>的方法。</p>
<p>协同滤波方法分为基于内存的和基于模型的两类。基于内存方法的一个著名示例是基于用户的算法，基于模型的算法的一个示例是 <a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Recommender_system#cite_note-42">Kernel-Mapping Recommender</a>。</p>
<p>协同滤波常用于推荐系统。这种技术旨在填补user-item关联矩阵中缺失的条目。spark.mllib目前支持基于模型的协同滤波，其中用户和产品由一小组潜在因子描述，这些因子可用于预测缺失的条目。spark.mllib使用ALS(alternating least squares)(交替最小二乘法)算法来学习这些潜在因子。spark.mllib的实现拥有下列参数:</p>
<ul>
<li>numBlocks 是用来并行化计算的块的数量(设置-1自动配置)</li>
<li>rank 是特征数量(也被称为潜在因子数量)</li>
<li>iterations 是ALS 算法要运行的迭代次数</li>
<li>lambda 指定了ALS中的正则化(regularization)参数</li>
<li>implicitPrefs 指定使用显式反馈的ALS变体，还是使用适配隐式反馈数据的变体</li>
<li>alpha 是适用于ALS隐式反馈变体的参数，用于控制偏好观察的基线置信度</li>
</ul>
<h2 id="显式-Explicit-vs-隐式反馈-implicit-feedback"><a href="#显式-Explicit-vs-隐式反馈-implicit-feedback" class="headerlink" title="显式(Explicit) vs. 隐式反馈(implicit feedback)"></a>显式(Explicit) vs. 隐式反馈(implicit feedback)</h2><p>基于矩阵分解的协同滤波的标准方法将user-item矩阵中的条目视为用户对物品给出的显式偏好，例如，用户给电影的评分。</p>
<p>现实世界中的用例通常只能获得隐式反馈(如浏览、点击、购买、喜欢、分享等)。spark.mllib处理这种数据的方法源自论文《隐式反馈数据集的协同滤波》(Collaborative Filtering for Implicit Feedback Datasets)。本质上，这种方法不直接对评分矩阵建模，而是将数据看做对用户行为(例如点击次数，或用户观看电影的累计时长)观察强度的数值表示。这些数值关系到所观察到的用户偏好的置信水平，而不是对item的显式评分。模型随后试图找到能够用来预测用户对某个item的预期偏好的潜在因子。</p>
<h2 id="正则参数的规模"><a href="#正则参数的规模" class="headerlink" title="正则参数的规模"></a>正则参数的规模</h2><p>自v1.1版本起，我们在求解每个最小二乘问题时对正则参数lambda进行缩放(scale)：更新用户因子时按该用户产生的评分数量缩放，更新产品因子时按该产品收到的评分数量缩放。这种方法叫做“ALS-WR”，在论文“<a target="_blank" rel="noopener" href="http://dx.doi.org/10.1007/978-3-540-68880-8_32">Netflix Prize 的大规模并行协同滤波</a>”中讨论过。它使lambda更少地依赖数据集的规模，因此我们可以把从抽样子集中学得的最佳参数应用到全部数据集，并期待相似的性能。</p>
<h2 id="示例"><a href="#示例" class="headerlink" title="示例"></a>示例</h2><p>下面的示例中，我们载入评分数据。每一行由一个用户，一个产品，一个评分组成。我们使用默认的、假定评分为显式的ALS.train()方法。我们通过测量评分预测的均方误差(Mean Squared Error)评估推荐模型.</p>
<p>更多细节参考 <a target="_blank" rel="noopener" href="http://spark.apache.org/docs/2.4.0/api/java/org/apache/spark/mllib/recommendation/ALS.html">ALS Java docs</a></p>
<pre><code>import scala.Tuple2;

import org.apache.spark.api.java.*;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import org.apache.spark.SparkConf;

SparkConf conf = new SparkConf().setAppName(&quot;Java Collaborative Filtering Example&quot;);
JavaSparkContext jsc = new JavaSparkContext(conf);

//load and parse the data
String path = &quot;data/mllib/als/test.data&quot;;
JavaRDD&lt;String&gt; data = jsc.textFile(path);
JavaRDD&lt;Rating&gt; ratings = data.map(s -&gt; &#123;
    String[] sarray = s.split(&quot;,&quot;);
    return new Rating(Integer.parseInt(sarray[0]),
        Integer.parseInt(sarray[1]),
        Double.parseDouble(sarray[2]));
&#125;);

//Build the recommendation model using ALS
int rank = 10;
int numIterations = 10;
MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);

//Evaluate the model on rating data
JavaRDD&lt;Tuple2&lt;Object, Object&gt;&gt; userProducts = ratings.map(r -&gt; new Tuple2&lt;&gt;(r.user(), r.product()));
JavaPairRDD&lt;Tuple2&lt;Integer, Integer&gt;, Double&gt; predictions = JavaPairRDD.fromJavaRDD(
    model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD()
         .map(r -&gt; new Tuple2&lt;&gt;(new Tuple2&lt;&gt;(r.user(), r.product()), r.rating()))
    );
JavaRDD&lt;Tuple2&lt;Double, Double&gt;&gt; rateAndPreds = JavaPairRDD.fromJavaRDD(
    ratings.map(r -&gt; new Tuple2&lt;&gt;(new Tuple2&lt;&gt;(r.user(), r.product()), r.rating())))
    .join(predictions).values();
double MSE = rateAndPreds.mapToDouble(pair -&gt; &#123;
    double err = pair._1() - pair._2();
    return err * err;
&#125;).mean();
System.out.println(&quot;Mean Squared Error = &quot; + MSE);

//save and load model
model.save(jsc.sc(), &quot;target/tmp/myCollaborativeFilter&quot;);
MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(jsc.sc(),
    &quot;target/tmp/myCollaborativeFilter&quot;);</code></pre>
<h2 id="MLlib-中的聚类和分类"><a href="#MLlib-中的聚类和分类" class="headerlink" title="MLlib 中的聚类和分类"></a>MLlib 中的聚类和分类</h2><p>聚类和分类是机器学习中两个常用的算法，聚类将数据分开为不同的集合，分类对新数据进行类别预测。</p>
<h3 id="聚类和分类"><a href="#聚类和分类" class="headerlink" title="聚类和分类"></a>聚类和分类</h3><p>(1)什么是聚类</p>
<p>聚类(Clustering)是将数据对象分组成多个类或簇(Cluster),它的目标是：在同一个簇中的对象之间具有较高的相似度，不同簇中的对象差别很大。聚类是人们日常生活中的常见行为，“物以类聚，人以群分”,其核心思想在于分组，人们不断改进聚类模式来学习如何区分各个事物和人.</p>
<p>(2)什么是分类</p>
<p>数据仓库、数据库、或其他信息库中有许多可以为商业、科研等活动的决策提供所需要的知识。分类和预测即是其中的两种数据分析形式，可以用来抽取能够描述重要数据集合或预测未来数据趋势。分类方法(Classification)用于预测数据对象的离散类别(Categorical Label)；预测方法(Prediction)用于预测数据对象的连续取值。</p>
<pre><code>**分类流程**: 新样本-&gt;特征选取-&gt;分类-&gt;评价
**训练流程**: 训练集-&gt;特征选取-&gt;训练-&gt;分类器</code></pre>
<p>最初，机器学习的分类应用大多是在这些方法及基于内存基础上所构造的算法。目前，数据挖掘方法都要求具有基于外存以处理大规模数据集合能力，同时具有可扩展能力。</p>
<h3 id="MLlib-中的聚类和分类-1"><a href="#MLlib-中的聚类和分类-1" class="headerlink" title="MLlib 中的聚类和分类"></a>MLlib 中的聚类和分类</h3><p>MLlib目前已经实现了K-Means聚类算法，朴素贝叶斯和决策树分类算法。</p>
<p>(1) K-Means 算法</p>
<p>K-Means聚类算法能够轻松的对聚类问题建模，并且能够在分布式的环境下运行。</p>
<p>K-Means 聚类算法中的K是聚类的数目，算法会强制要求用户输入。如果将新闻聚类成诸如政治、经济、文化等大类，可以选择10~20的数字作为K，因为这种顶级类别的数量是很小的。如果要对这些新闻详细分类，选择50~100的数字也是没有问题的。</p>
<p>K-Means聚类算法主要可以分为三步。第一步是为待聚类的点寻找聚类中心；第二步是计算每个点到聚类中心的距离，将每个点聚类到离该点最近的聚类中去；第三步是计算聚类中所有点的坐标平均值，并将这个平均值作为新的聚类中心点。反复执行第二步和第三步，直到聚类中心不再进行大范围的移动，或者聚类次数达到要求为止。</p>
<p>(2)MLlib 之 K-Means源码解析</p>
<p>MLlib的K-Means的原理是：在同一个数据集上，跑多个K-Means算法(每个称为一个run),然后返回效果最好的那个聚类的类簇中心。初始的类簇中心点的选取有两种方法，一种是随机，另一种是采用KMeans||(KMeans++ 的并行化版本)。算法的停止条件是迭代次数达到设置的次数，或者在某一次迭代后所有run的K-Means算法都收敛。</p>
<ol>
<li>类簇中心初始化</li>
</ol>
<p>对每个运行的K-Means随机选择K个点作为初始类簇：</p>
<pre><code>private def initRandom(data: RDD[Array[Double]]): Array[ClusterCenters] = &#123;
    //Sample all the cluster centers in one pass to avoid repeated scans
    val sample = data.takeSample(true, runs * k, new Random().nextInt()).toSeq
    //Run r takes the r-th group of k sampled points as its initial centers
    Array.tabulate(runs)(r =&gt; sample.slice(r * k, (r + 1) * k).toArray)
&#125;</code></pre>
<ol start="2">
<li>计算属于某个类簇的点</li>
</ol>
<p>在每一次迭代中，首先会计算属于各个类簇的点，然后更新各个类簇的中心</p>
<pre><code>//K-Means算法的并行实现通过Spark的mapPartitions函数，通过该函数获取到分区的迭代器。可以在每个分区内计算该分区内的点属于哪个类簇，之后对于每个运行算法中的每个类簇计算属于该类簇的点的个数以及累加和。

val totalContribs = data.mapPartitions &#123; points =&gt;
    val runs = activeCenters.length
    val k = activeCenters(0).length
    val dims = activeCenters(0)(0).length

    val sums = Array.fill(runs, k)(new DoubleMatrix(dims))
    val counts = Array.fill(runs, k)(0L)

    for (point &lt;- points; (centers, runIndex) &lt;- activeCenters.zipWithIndex) &#123;
        //找到距离该点最近的聚类中心点
        val (bestCenter, cost) = KMeans.findClosest(centers, point)
        //统计该运行算法开销
        costAccum(runIndex) += cost
        //将该点累加到距离它最近的类簇的坐标和中
        sums(runIndex)(bestCenter).addi(new DoubleMatrix(point))
        //将该类簇的点数量加1，之后 sum.divi(count) 就是该类簇的新中心
        counts(runIndex)(bestCenter) += 1
    &#125;

    val contribs = for (i &lt;- 0 until runs; j &lt;- 0 until k) yield &#123;
        ((i, j), (sums(i)(j), counts(i)(j)))
    &#125;
    contribs.iterator
//对于每个运行算法的每个类簇计算属于该类簇的点的个数和加和
&#125;.reduceByKey(mergeContribs).collectAsMap()</code></pre>
<h2 id="DataFrame-based-API-is-primary-API"><a href="#DataFrame-based-API-is-primary-API" class="headerlink" title="DataFrame-based API is primary API"></a><a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/mllib-statistics.html">DataFrame-based API is primary API</a></h2><p>0, 基于RDD的API在spark.mllib 中现在进入维护状态。现在Spark主要的机器学习API是在spark.ml包中的基于 DataFrame 的API.</p>
<h2 id="MLlib-RDD-based-API"><a href="#MLlib-RDD-based-API" class="headerlink" title="MLlib: RDD-based API"></a>MLlib: RDD-based API</h2><pre><code>package spark.mllib</code></pre>
<h3 id="DataTypes"><a href="#DataTypes" class="headerlink" title="DataTypes"></a>DataTypes</h3><p>MLlib支持向量和矩阵存储在单机上，同样支持存储在一个或多个RDD的分布式矩阵。本地向量和本地矩阵是服务于公共接口的简单数据模型。底层的线性代数操作有<a target="_blank" rel="noopener" href="http://www.scalanlp.org/">Breeze</a>提供。</p>
<h4 id="本地向量-Local-vector"><a href="#本地向量-Local-vector" class="headerlink" title="本地向量(Local vector)"></a>本地向量(Local vector)</h4><p>本地向量拥有整数类型、从0开始的索引和double类型的值，存储在单机上。MLlib支持两种类型的本地向量：稠密的和稀疏的。一个稠密向量由一个表示其各元素值的double数组支持，而一个稀疏向量由两个并行数组支持：索引和值。例如，一个向量(1.0, 0.0, 3.0)可以被表示为稠密形式[1.0, 0.0, 3.0] 或者稀疏形式(3,[0,2],[1.0,3.0]), 3 是向量大小。</p>
<p>本地向量类是<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/Vector.html">Vector</a>,我们提供两种实现：<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/DenseVector.html">DenseVector</a> 和<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/SparseVector.html">SparseVector</a>. 我们推荐使用Vectors 中实现的工厂方法创建本地向量，参考<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/Vector.html">Ｖector Java docs</a> 和<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/Vectors.html">Vectors Java docs</a>查看更多细节。</p>
<pre><code>import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

Vector dv = Vectors.dense(1.0, 0.0, 3.0);
Vector sv = Vectors.sparse(3, new int[]&#123;0,2&#125;, new double[]&#123;1.0, 3.0&#125;);</code></pre>
<h4 id="标签点-Labeled-point"><a href="#标签点-Labeled-point" class="headerlink" title="标签点(Labeled point)"></a>标签点(Labeled point)</h4><p>一个标签点是一个本地向量，稀疏的或者稠密的，与一个标签/响应(label/response)相关联。在MLlib中，标签点在有监督的学习算法(supervised learning algorithms)中使用。我们使用double存储一个标签，如此我们就能在回归(regression)和分类(classification)中使用标签点了。对于二分类，一个标签应该是0(负类)或1(正类)。对于多分类，标签应该是从0开始的类别索引：0, 1, 2, …。</p>
<p>一个标签点表示为<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/regression/LabeledPoint.html">LabeledPoint</a></p>
<pre><code>import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;

LabeledPoint pos = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0));
LabeledPoint neg = new LabeledPoint(0.0, Vectors.sparse(3, new int[]&#123;0,2&#125;, new double[]&#123;1.0, 3.0&#125;));</code></pre>
<h5 id="Sparse-data"><a href="#Sparse-data" class="headerlink" title="Sparse data"></a>Sparse data</h5><p>实际中很常见的是拥有稀疏的训练数据。MLlib 支持读取存储为LIBSVM格式的训练示例，这是<a target="_blank" rel="noopener" href="http://www.csie.ntu.edu.tw/~cjlin/libsvm/">LIBSVM</a> 和 <a target="_blank" rel="noopener" href="http://www.csie.ntu.edu.tw/~cjlin/liblinear/">LIBLINEAR</a> 默认的数据格式。它是一种文本格式，每一行使用如下方式表示一个带标签的稀疏特征向量：</p>
<pre><code>label index1:value1 index2:value2</code></pre>
<p>索引从1开始并按递增顺序排列。载入完成后，特征索引被转换为从0开始的索引。</p>
<p><a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/util/MLUtils.html">MLUtils.loadLibSVMFile</a>读取存储为LIBSVM格式的训练示例。</p>
<pre><code>import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.api.java.JavaRDD;

JavaRDD&lt;LabeledPoint&gt; examples = MLUtils.loadLibSVMFile(jsc.sc(), &quot;data/mllib/sample_libsvm_data.txt&quot;).toJavaRDD();</code></pre>
<h4 id="本地矩阵"><a href="#本地矩阵" class="headerlink" title="本地矩阵"></a>本地矩阵</h4><p>一个本地矩阵拥有整数类型的行、列索引和double类型的值，存储在单机上。MLlib支持稠密矩阵(其元素值按列主序存储在一个double数组中)和稀疏矩阵(其非零元素值按列主序以CSC格式存储)。</p>
<p>本地矩阵的基础类是<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/Matrix.html">Matrix</a>,我们提供两种实现：<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/SparseMatrix.html">SparseMatrix</a>和<a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/DenseMatrix.html">DenseMatrix</a>. 我们推荐使用Matrices中的工厂方法。记住，本地矩阵按列存储。</p>
<pre><code>import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Matrices;

Matrix dm = Matrices.dense(3, 2, new double[]&#123;1.0, 3.0, 5.0, 2.0, 4.0, 6.0&#125;);
Matrix sm = Matrices.sparse(3, 3, new int[]&#123;0, 1, 3&#125;, new int[]&#123;0,2,1&#125;, new double[]&#123;9, 6, 8&#125;);</code></pre>
<h3 id="分布式矩阵"><a href="#分布式矩阵" class="headerlink" title="分布式矩阵"></a>分布式矩阵</h3><p>一个分布式矩阵拥有long类型的行、列索引和double类型的值，在一个或多个RDD上分布存储。选择合适的格式存储大规模和分布式矩阵是很重要的。转换一个分布式矩阵到不同的格式可能需要全局的混洗(shuffle)，代价是昂贵的。目前已经实现了四种类型的分布式矩阵。</p>
<p>基础类型是 RowMatrix。一个行矩阵是一个面向行的分布式矩阵，没有有意义的行索引，例如一个特征向量的集合。它由其各行组成的RDD提供支持，每一行是一个本地向量。我们假定行矩阵的列数不是很巨大，这样单个本地向量可以合理地传送给驱动程序，也能够在单个节点上存储和操作。一个带索引的行矩阵(IndexedRowMatrix)类似于行矩阵(RowMatrix)，但带有行索引，可以用来标识行和执行join操作。一个坐标矩阵(CoordinateMatrix)是以坐标列表(<a target="_blank" rel="noopener" href="https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_.28COO.29">coordinate list</a>)格式存储的分布式矩阵，由其元素(entries)组成的RDD提供支持。一个块矩阵(BlockMatrix)是一个分布式矩阵，由MatrixBlock组成的RDD提供支持，MatrixBlock是一个(Int, Int, Matrix)的元组。</p>
<h4 id="RowMatrix"><a href="#RowMatrix" class="headerlink" title="RowMatrix"></a>RowMatrix</h4><pre><code>import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.distributed.RowMatrix;

JavaRDD&lt;Vector&gt; rows = ...;
RowMatrix mat = new RowMatrix(rows.rdd());

long m = mat.numRows();
long n = mat.numCols();</code></pre>
<h4 id="IndexedRowMatrix"><a href="#IndexedRowMatrix" class="headerlink" title="IndexedRowMatrix"></a>IndexedRowMatrix</h4><p>一个 <a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.html">IndexedRowMatrix</a>可以从一个JavaRDD&lt;IndexedRow&gt; 实例创建，IndexedRow 是一个(long, Vector)的封装。一个IndexedRowMatrix 可以去掉索引转换为RowMatrix.</p>
<pre><code>import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.distributed.IndexedRow;
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
import org.apache.spark.mllib.linalg.distributed.RowMatrix;

JavaRDD&lt;IndexedRow&gt; rows = ...;
IndexedRowMatrix mat = new IndexedRowMatrix(rows.rdd());
long m = mat.numRows();
long n = mat.numCols();</code></pre>
<h4 id="CoordinateMatrix"><a href="#CoordinateMatrix" class="headerlink" title="CoordinateMatrix"></a>CoordinateMatrix</h4><p>CoordinateMatrix是由其元素(entries)组成的RDD提供支持的分布式矩阵。每个元素是(i: Long, j: Long, value: Double)类型的元组. CoordinateMatrix 只应该在矩阵的两个维度都很大而且矩阵非常稀疏的情况下使用。</p>
<p>CoordinateMatrix 可以从一个JavaRDD&lt;MatrixEntry&gt;实例创建，MatrixEntry是一个(long,long,double)的封装， CoordinateMatrix 可以转换为IndexedRowMatrix通过调用toIndexedRowMatrix.</p>
<pre><code>import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
import org.apache.spark.mllib.linalg.distributed.MatrixEntry;

JavaRDD&lt;MatrixEntry&gt; entries = ...;
CoordinateMatrix mat = new CoordinateMatrix(entries.rdd());

long m = mat.numRows();
long n = mat.numCols();

IndexedRowMatrix indexedRowMatrix = mat.toIndexedRowMatrix();</code></pre>
<h4 id="BlockMatrix"><a href="#BlockMatrix" class="headerlink" title="BlockMatrix"></a>BlockMatrix</h4><p>BlockMatrix 由MatrixBlock组成的RDD提供支持，MatrixBlock是一个((Int, Int), Matrix)类型的元组,(Int,Int)是块索引，Matrix 是一个子矩阵。 BlockMatrix 支持add，multiply操作，validate()方法用于验证BlockMatrix是否被正确地创建。</p>
<pre><code>import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.distributed.BlockMatrix;
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
import org.apache.spark.mllib.linalg.distributed.MatrixEntry;

JavaRDD&lt;MatrixEntry&gt; entries = ...;
CoordinateMatrix coorMat = new CoordinateMatrix(entries.rdd());
BlockMatrix matA = coorMat.toBlockMatrix().cache();
matA.validate();
BlockMatrix ata = matA.transpose().multiply(matA);</code></pre>
<h2 id="基于RDD-Api的基础的统计-Basic-Statistics-RDD-based-API"><a href="#基于RDD-Api的基础的统计-Basic-Statistics-RDD-based-API" class="headerlink" title="基于RDD Api的基础的统计(Basic Statistics -RDD-based API)"></a>基于RDD Api的基础的统计(Basic Statistics -RDD-based API)</h2><p>我们通过在Statistics中的colStats函数获得RDD[Vector]的列总结统计。</p>
<pre><code>import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;

JavaRDD&lt;Vector&gt; mat = jsc.parallelize(
    Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
    )
);

MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
System.out.println(summary.mean());                //每一列的均值
System.out.println(summary.variance());            //每一列的方差
System.out.println(summary.numNonzeros());         //每一列的非0数</code></pre>
<h3 id="相关性"><a href="#相关性" class="headerlink" title="相关性"></a>相关性</h3><p>统计学中计算两列数据的相关性是很常见的。在spark.mllib中我们提供了在很多列之间计算成对相关性(pairwise correlations)的灵活性。目前支持的相关性方法是 Pearson 和 Spearman 相关。</p>
<p><a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/stat/Statistics.html">Statistics</a>提供了方法计算两列之间的相关性。取决于输入类型(两个JavaDoubleRDD 或者一个JavaRDD&lt;Vector&gt;)，输出相应地为Double或者相关矩阵。</p>
<pre><code>import java.util.Arrays;

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.Statistics;

JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
    Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)
);

//与seriesX必须拥有相同的分区数和元素个数
JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
    Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0)
);

//使用Pearson计算相关性
//如果没有明确方法，Pearson是默认方法
Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), &quot;pearson&quot;);
System.out.println(&quot;Correlation is: &quot; + correlation);

//注意每个Vector是一行而不是一列
JavaRDD&lt;Vector&gt; data = jsc.parallelize(
    Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
    )
);

//计算相关性矩阵使用Pearson&#39;s 方法
//使用 &quot;spearman&quot; 作为Spearman方法
Matrix correlMatrix = Statistics.corr(data.rdd(), &quot;pearson&quot;);
System.out.println(correlMatrix.toString());</code></pre>
<h3 id="分层样本-Stratified-sampleing"><a href="#分层样本-Stratified-sampleing" class="headerlink" title="分层抽样(Stratified sampling)"></a>分层抽样(Stratified sampling)</h3><p>不像位于 spark.mllib 中的其他统计函数，分层抽样方法 sampleByKey 和 sampleByKeyExact 可以直接在键值对形式的 RDD 上执行。</p>
<h2 id="基于RDD的聚类和回归-Classfication-and-Regression-RDD-based"><a href="#基于RDD的聚类和回归-Classfication-and-Regression-RDD-based" class="headerlink" title="基于RDD的分类和回归(Classification and Regression - RDD-based)"></a><a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/mllib-classification-regression.html">基于RDD的分类和回归(Classification and Regression - RDD-based)</a></h2><p>spark.mllib包支持用于二分类、多类分类和回归分析的各种方法。下表概括了每种问题类型所支持的算法。</p>
<table>
<thead>
<tr>
<th align="left">问题类型(Problem Type)</th>
<th align="left">支持的方法(Supported Method)</th>
</tr>
</thead>
<tbody><tr>
<td align="left">二分类(Binary Classification)</td>
<td align="left">线性支持向量机(linear SVMs), 逻辑回归(logistic regression), 决策树(decision trees), 随机森林(random forests), 梯度提升树(gradient-boosted trees), 朴素贝叶斯(naive Bayes)</td>
</tr>
<tr>
<td align="left">多类分类(Multiclass Classification)</td>
<td align="left">逻辑回归(logistic regression), 决策树(decision trees), 随机森林(random forests), 朴素贝叶斯(naive Bayes)</td>
</tr>
<tr>
<td align="left">回归(Regression)</td>
<td align="left">线性最小二乘(linear least squares), 套索回归(Lasso), 岭回归(ridge regression), 决策树(decision trees), 随机森林(random forests), 梯度提升树(gradient-boosted trees), 保序回归(isotonic regression)</td>
</tr>
</tbody></table>
<h3 id="线性方法"><a href="#线性方法" class="headerlink" title="线性方法"></a><a target="_blank" rel="noopener" href="http://spark.apache.org/docs/latest/mllib-linear-methods.html">线性方法</a></h3><h4 id="Mathematical-formulation"><a href="#Mathematical-formulation" class="headerlink" title="Mathematical formulation"></a>Mathematical formulation</h4><h4 id="Loss-functions"><a href="#Loss-functions" class="headerlink" title="Loss functions"></a>Loss functions</h4><h4 id="Regularizers"><a href="#Regularizers" class="headerlink" title="Regularizers"></a>Regularizers</h4><h4 id="Optimization"><a href="#Optimization" class="headerlink" title="Optimization"></a>Optimization</h4><h4 id="分类-Classifacation"><a href="#分类-Classifacation" class="headerlink" title="分类(Classification)"></a>分类(Classification)</h4><p>最常见的分类类型是二分类，即只有两个类别，通常称为正类和负类。如果类别超过两个，则称为多类分类。spark.mllib支持两种线性分类方法：线性支持向量机(SVM)和逻辑回归。</p>

            </div>
            <hr/>

            

    <div class="reprint" id="reprint-statement">
        
            <div class="reprint__author">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-user">
                        文章作者:
                    </i>
                </span>
                <span class="reprint-info">
                    <a href="http://libo0.gitee.io" rel="external nofollow noreferrer">libo</a>
                </span>
            </div>
            <div class="reprint__type">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-link">
                        文章链接:
                    </i>
                </span>
                <span class="reprint-info">
                    <a href="http://libo0.gitee.io/2019/01/13/Spark%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0/">http://libo0.gitee.io/2019/01/13/Spark%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0/</a>
                </span>
            </div>
            <div class="reprint__notice">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-copyright">
                        版权声明:
                    </i>
                </span>
                <span class="reprint-info">
                    本博客所有文章除特别声明外，均采用
                    <a href="https://creativecommons.org/licenses/by/4.0/deed.zh" rel="external nofollow noreferrer" target="_blank">CC BY 4.0</a>
                    许可协议。转载请注明来源
                    <a href="http://libo0.gitee.io" target="_blank">libo</a>
                    !
                </span>
            </div>
        
    </div>

    <script async defer>
      // Whenever the visitor copies something from the page, show a toast that
      // reminds them of the reprint rules and offers a button that jumps to the
      // reprint statement. `M.toast` is presumably the Materialize toast API
      // (materialize.min.js is loaded by this page) -- TODO confirm.
      document.addEventListener("copy", function (e) {
        // The action element is a <button>, so it must be closed with </button>
        // (it was previously closed with a stray </a>).
        let toastHTML = '<span>复制成功，请遵循本文的转载规则</span><button class="btn-flat toast-action" onclick="navToReprintStatement()" style="font-size: smaller">查看</button>';
        M.toast({html: toastHTML})
      });

      /**
       * Smoothly scrolls the page to the #reprint-statement box.
       * Stops 80px above the box and animates over 800ms. Invoked from the
       * toast button's inline onclick handler, so it must stay a global function.
       */
      function navToReprintStatement() {
        $("html, body").animate({scrollTop: $("#reprint-statement").offset().top - 80}, 800);
      }
    </script>



            <div class="tag_share" style="display: block;">
                <div class="post-meta__tag-list" style="display: inline-block;">
                    
                        <div class="article-tag">
                            
                                <a href="/tags/spark/">
                                    <span class="chip bg-color">spark</span>
                                </a>
                            
                                <a href="/tags/ml/">
                                    <span class="chip bg-color">ml</span>
                                </a>
                            
                        </div>
                    
                </div>
                <div class="post_share" style="zoom: 80%; width: fit-content; display: inline-block; float: right; margin: -0.15rem 0;">
                    <link rel="stylesheet" type="text/css" href="/libs/share/css/share.min.css">

<div id="article-share">
    
    
    <div class="social-share" data-sites="qq,qzone,wechat,weibo,douban,linkedin,twitter,facebook,google" data-wechat-qrcode-helper="<p>微信扫一扫即可分享！</p>"></div>
    <script src="/libs/share/js/social-share.min.js"></script>
    

    

</div>

                </div>
            </div>
            
                <style>
    #reward {
        margin: 40px 0;
        text-align: center;
    }

    #reward .reward-link {
        font-size: 1.4rem;
        line-height: 38px;
    }

    #reward .btn-floating:hover {
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.2), 0 5px 15px rgba(0, 0, 0, 0.2);
    }

    #rewardModal {
        width: 320px;
        height: 350px;
    }

    #rewardModal .reward-title {
        margin: 15px auto;
        padding-bottom: 5px;
    }

    #rewardModal .modal-content {
        padding: 10px;
    }

    #rewardModal .close {
        position: absolute;
        right: 15px;
        top: 15px;
        color: rgba(0, 0, 0, 0.5);
        font-size: 1.3rem;
        line-height: 20px;
        cursor: pointer;
    }

    #rewardModal .close:hover {
        color: #ef5350;
        transform: scale(1.3);
        -moz-transform:scale(1.3);
        -webkit-transform:scale(1.3);
        -o-transform:scale(1.3);
    }

    #rewardModal .reward-tabs {
        margin: 0 auto;
        width: 210px;
    }

    .reward-tabs .tabs {
        height: 38px;
        margin: 10px auto;
        padding-left: 0;
    }

    .reward-content ul {
        padding-left: 0 !important;
    }

    .reward-tabs .tabs .tab {
        height: 38px;
        line-height: 38px;
    }

    .reward-tabs .tab a {
        color: #fff;
        background-color: #ccc;
    }

    .reward-tabs .tab a:hover {
        background-color: #ccc;
        color: #fff;
    }

    .reward-tabs .wechat-tab .active {
        color: #fff !important;
        background-color: #22AB38 !important;
    }

    .reward-tabs .alipay-tab .active {
        color: #fff !important;
        background-color: #019FE8 !important;
    }

    .reward-tabs .reward-img {
        width: 210px;
        height: 210px;
    }
</style>

<div id="reward">
    <a href="#rewardModal" class="reward-link modal-trigger btn-floating btn-medium waves-effect waves-light red">赏</a>

    <!-- Modal Structure -->
    <div id="rewardModal" class="modal">
        <div class="modal-content">
            <a class="close modal-close"><i class="fas fa-times"></i></a>
            <h4 class="reward-title">你的赏识是我前进的动力</h4>
            <div class="reward-content">
                <div class="reward-tabs">
                    <ul class="tabs row">
                        <li class="tab col s6 alipay-tab waves-effect waves-light"><a href="#alipay">支付宝</a></li>
                        <li class="tab col s6 wechat-tab waves-effect waves-light"><a href="#wechat">微 信</a></li>
                    </ul>
                    <div id="alipay">
                        <img src="/medias/reward/alipay.jpg" class="reward-img" alt="支付宝打赏二维码">
                    </div>
                    <div id="wechat">
                        <img src="/medias/reward/wechat.png" class="reward-img" alt="微信打赏二维码">
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<script>
    // Once the DOM is ready, activate every `.tabs` element on the page
    // (here: the Alipay / WeChat switcher inside the reward modal).
    // `.tabs()` is presumably the Materialize jQuery plugin -- TODO confirm.
    $(() => {
        const $tabBars = $('.tabs');
        $tabBars.tabs();
    });
</script>
            
        </div>
    </div>

    
        <link rel="stylesheet" href="/libs/gitalk/gitalk.css">
<link rel="stylesheet" href="/css/my-gitalk.css">

<div class="card gitalk-card" data-aos="fade-up">
    <div class="comment_headling" style="font-size: 20px; font-weight: 700; position: relative; left: 20px; top: 15px; padding-bottom: 5px;">
        <i class="fas fa-comments fa-fw" aria-hidden="true"></i>
        <span>评论</span>
    </div>
    <div id="gitalk-container" class="card-content"></div>
</div>

<script src="/libs/gitalk/gitalk.min.js"></script>
<script>
    // Create the Gitalk comment widget for this post and mount it into the
    // #gitalk-container element. The credential / repository fields are
    // emitted empty by the site configuration.
    let gitalk = new Gitalk({
        clientID: "",
        clientSecret: "",
        repo: "",
        owner: "",
        admin: null,
        // Identifier passed to Gitalk for this post.
        id: "2019-01-13T00-00-00",
        // Facebook-like distraction free mode (disabled).
        distractionFreeMode: false
    });

    gitalk.render("gitalk-container");
</script>
    

    

    

    

    

    

<article id="prenext-posts" class="prev-next articles">
    <div class="row article-row">
        
        <div class="article col s12 m6" data-aos="fade-up">
            <div class="article-badge left-badge text-color">
                <i class="fas fa-chevron-left"></i>&nbsp;上一篇</div>
            <div class="card">
                <a href="/2019/01/13/%E5%9C%A8windows%E4%B8%8B%E8%BF%9E%E6%8E%A5%E5%88%B0Virtualbox%E4%B8%8B%E7%9A%84linux%E8%99%9A%E6%8B%9F%E6%9C%BA/">
                    <div class="card-image">
                        
                        
                        <img src="/medias/featureimages/13.jpg" class="responsive-img" alt="在windows下连接到Virtualbox下的linux虚拟机">
                        
                        <span class="card-title">在windows下连接到Virtualbox下的linux虚拟机</span>
                    </div>
                </a>
                <div class="card-content article-content">
                    <div class="summary block-with-text">
                        
                            通过ssh 连接 virtualbox 的ubuntu18
首先虚拟机要安装 openssh-server



进入ubuntu虚拟机terminal,输入:
sudo apt install openssh-server

ssh 连接
                        
                    </div>
                    <div class="publish-info">
                        <span class="publish-date">
                            <i class="far fa-clock fa-fw icon-date"></i>2019-01-13
                        </span>
                        <span class="publish-author">
                            
                            <i class="fas fa-user fa-fw"></i>
                            libo
                            
                        </span>
                    </div>
                </div>
                
                <div class="card-action article-tags">
                    
                    <a href="/tags/linux/">
                        <span class="chip bg-color">linux</span>
                    </a>
                    
                    <a href="/tags/Virtualbox/">
                        <span class="chip bg-color">Virtualbox</span>
                    </a>
                    
                    <a href="/tags/windows/">
                        <span class="chip bg-color">windows</span>
                    </a>
                    
                    <a href="/tags/ssh/">
                        <span class="chip bg-color">ssh</span>
                    </a>
                    
                </div>
                
            </div>
        </div>
        
        
        <div class="article col s12 m6" data-aos="fade-up">
            <div class="article-badge right-badge text-color">
                下一篇&nbsp;<i class="fas fa-chevron-right"></i>
            </div>
            <div class="card">
                <a href="/2019/01/13/spark%E6%A6%82%E5%BF%B5%E5%AD%A6%E4%B9%A0/">
                    <div class="card-image">
                        
                        
                        <img src="/medias/featureimages/10.jpg" class="responsive-img" alt="spark概念学习">
                        
                        <span class="card-title">spark概念学习</span>
                    </div>
                </a>
                <div class="card-content article-content">
                    <div class="summary block-with-text">
                        
                            spark 概念学习Apache Spark 是一个快速通用的族计算系统。它为Java, Scala, Python, R 提供了高层次的API, 并且提供了一个优化的引擎来支持通用图计算。


Quick StartRDD Program
                        
                    </div>
                    <div class="publish-info">
                            <span class="publish-date">
                                <i class="far fa-clock fa-fw icon-date"></i>2019-01-13
                            </span>
                        <span class="publish-author">
                            
                            <i class="fas fa-user fa-fw"></i>
                            libo
                            
                        </span>
                    </div>
                </div>
                
                <div class="card-action article-tags">
                    
                    <a href="/tags/Spark/">
                        <span class="chip bg-color">Spark</span>
                    </a>
                    
                </div>
                
            </div>
        </div>
        
    </div>
</article>

</div>



<!-- 代码块功能依赖 -->
<script type="text/javascript" src="/libs/codeBlock/codeBlockFuction.js"></script>
<script type="text/javascript" src="/libs/highlight/highlight.pack.js"></script>
<!-- 代码语言 -->

<script type="text/javascript" src="/libs/codeBlock/codeLang.js"></script>

    
<!-- 代码块复制 -->

<script type="text/javascript" src="/libs/codeBlock/codeCopy.js"></script>


<!-- 代码块收缩 -->

<script type="text/javascript" src="/libs/codeBlock/codeShrink.js"></script>


<!-- 代码块折行 -->

<style type="text/css">
code[class*="language-"], pre[class*="language-"] { white-space: pre !important; }
</style>

    </div>
    <div id="toc-aside" class="expanded col l3 hide-on-med-and-down">
        <div class="toc-widget">
            <div class="toc-title"><i class="far fa-list-alt"></i>&nbsp;&nbsp;目录</div>
            <div id="toc-content"></div>
        </div>
    </div>
</div>

<!-- TOC 悬浮按钮. -->

<div id="floating-toc-btn" class="hide-on-med-and-down">
    <a class="btn-floating btn-large bg-color">
        <i class="fas fa-list-ul"></i>
    </a>
</div>


<script src="/libs/tocbot/tocbot.min.js"></script>
<script>
    // Build the table of contents and wire up its behaviour once the DOM is ready.
    $(() => {
        const $win = $(window);

        // Let tocbot generate the TOC for the article headings.
        tocbot.init({
            tocSelector: '#toc-content',
            contentSelector: '#articleContent',
            headingsOffset: -($win.height() * 0.4 - 45),
            collapseDepth: Number('0'),
            headingSelector: 'h2, h3, h4'
        });

        // Replace the generated anchors with sequential ASCII ones
        // ("toc-heading-1", "toc-heading-2", ...) to support Chinese headings:
        // first the TOC link targets, then the ids of the headings themselves.
        const headingPrefix = 'toc-heading-';
        $('#toc-content a').each((index, link) => {
            $(link).attr('href', '#' + headingPrefix + (index + 1));
        });
        $('#articleContent').children('h2, h3, h4').each((index, heading) => {
            $(heading).attr('id', headingPrefix + (index + 1));
        });

        // Pin the TOC widget once the page has been scrolled past the threshold.
        const stickyThreshold = parseInt($win.height() * 0.4 - 64);
        const $widget = $('.toc-widget');
        $win.on('scroll', () => {
            $widget.toggleClass('toc-fixed', $win.scrollTop() > stickyThreshold);
        });

        // Resize the element with id `targetId` to the width of the element
        // with id `srcId` plus a small allowance that depends on that width.
        // Does nothing when the source element does not exist.
        const fixPostCardWidth = (srcId, targetId) => {
            const $source = $('#' + srcId);
            if ($source.length === 0) {
                return;
            }

            const baseWidth = $source.width();
            let allowance;
            if (baseWidth >= 450) {
                allowance = 21;
            } else if (baseWidth >= 350) {
                allowance = 18;
            } else if (baseWidth >= 300) {
                allowance = 16;
            } else {
                allowance = 14;
            }
            $('#' + targetId).width(baseWidth + allowance);
        };

        // Floating button: toggle the TOC sidebar between expanded and collapsed,
        // giving the main column the `l9` grid class only while the TOC is shown.
        const expandedClass = 'expanded';
        const $sidebar = $('#toc-aside');
        const $content = $('#main-content');
        $('#floating-toc-btn .btn-floating').on('click', () => {
            const expandNow = !$sidebar.hasClass(expandedClass);
            $sidebar.toggleClass(expandedClass, expandNow);
            if (expandNow) {
                $sidebar.show();
            } else {
                $sidebar.hide();
            }
            $content.toggleClass('l9', expandNow);
            fixPostCardWidth('artDetail', 'prenext-posts');
        });

    });
</script>

    

</main>



    <footer class="page-footer bg-color">
    <div class="container row center-align" style="margin-bottom: 15px !important;">
        <div class="col s12 m8 l8 copy-right">
            Copyright&nbsp;&copy;
            <span id="year">2020</span>
            <a href="http://libo0.gitee.io" target="_blank">libo</a>
            |&nbsp;Powered by&nbsp;<a href="https://hexo.io/" target="_blank">Hexo</a>
            |&nbsp;Theme&nbsp;<a href="https://github.com/blinkfox/hexo-theme-matery" target="_blank">Matery</a>
            <br>
            
            
            
            
            
            
            <span id="busuanzi_container_site_pv">
                |&nbsp;<i class="far fa-eye"></i>&nbsp;总访问量:&nbsp;<span id="busuanzi_value_site_pv"
                    class="white-color"></span>&nbsp;次
            </span>
            
            
            <span id="busuanzi_container_site_uv">
                |&nbsp;<i class="fas fa-users"></i>&nbsp;总访问人数:&nbsp;<span id="busuanzi_value_site_uv"
                    class="white-color"></span>&nbsp;人
            </span>
            
            <br>
            
            <span id="sitetime">载入运行时间...</span>
            <script>
                /**
                 * Updates the footer with the site's running time.
                 *
                 * Writes the copyright year (or "start - current" year range) into the
                 * #year element and the time elapsed since the hard-coded launch moment
                 * (2020-03-01 18:30:00) into the #sitetime element.
                 *
                 * The visitor's local wall-clock fields are fed into Date.UTC on both
                 * sides, so the difference is a plain calendar difference. A "year" is
                 * counted as a flat 365 days (leap days are not treated specially).
                 */
                function siteTime() {
                    // Durations in milliseconds.
                    var seconds = 1000;
                    var minutes = seconds * 60;
                    var hours = minutes * 60;
                    var days = hours * 24;
                    var years = days * 365;
                    var today = new Date();
                    // Launch moment; months here are 1-based (3 = March).
                    var startYear = "2020";
                    var startMonth = "3";
                    var startDate = "1";
                    var startHour = "18";
                    var startMinute = "30";
                    var startSecond = "0";
                    var todayYear = today.getFullYear();
                    // getMonth() is already zero-based, which is what Date.UTC expects.
                    var todayMonth = today.getMonth();
                    var todayDate = today.getDate();
                    var todayHour = today.getHours();
                    var todayMinute = today.getMinutes();
                    var todaySecond = today.getSeconds();
                    // Date.UTC takes a zero-based month index. Passing 1-based months on
                    // both sides shifted each date into the following month, and because
                    // months differ in length the elapsed time was off by up to 3 days.
                    var t1 = Date.UTC(startYear, startMonth - 1, startDate, startHour, startMinute, startSecond);
                    var t2 = Date.UTC(todayYear, todayMonth, todayDate, todayHour, todayMinute, todaySecond);
                    var diff = t2 - t1;
                    // Break the difference down into years / days / hours / minutes / seconds.
                    var diffYears = Math.floor(diff / years);
                    var diffDays = Math.floor((diff / days) - diffYears * 365);
                    var diffHours = Math.floor((diff - (diffYears * 365 + diffDays) * days) / hours);
                    var diffMinutes = Math.floor((diff - (diffYears * 365 + diffDays) * days - diffHours * hours) /
                        minutes);
                    var diffSeconds = Math.floor((diff - (diffYears * 365 + diffDays) * days - diffHours * hours -
                        diffMinutes * minutes) / seconds);
                    // Loose equality on purpose: startYear is a string, todayYear a number.
                    if (startYear == todayYear) {
                        document.getElementById("year").innerHTML = todayYear;
                        document.getElementById("sitetime").innerHTML = "本站已安全运行 " + diffDays + " 天 " + diffHours +
                            " 小时 " + diffMinutes + " 分钟 " + diffSeconds + " 秒";
                    } else {
                        document.getElementById("year").innerHTML = startYear + " - " + todayYear;
                        document.getElementById("sitetime").innerHTML = "本站已安全运行 " + diffYears + " 年 " + diffDays +
                            " 天 " + diffHours + " 小时 " + diffMinutes + " 分钟 " + diffSeconds + " 秒";
                    }
                }
                // Refresh the footer running-time display once per second; the first
                // refresh happens only after the initial 1000 ms delay has elapsed.
                setInterval(siteTime, 1000);
            </script>
            
            <br>
            
        </div>
        <div class="col s12 m4 l4 social-link social-statis">
    <a href="https://github.com/libo-0" class="tooltipped" target="_blank" data-tooltip="访问我的GitHub" data-position="top" data-delay="50">
        <i class="fab fa-github"></i>
    </a>



    <a href="mailto:libo-0@foxmail.com" class="tooltipped" target="_blank" data-tooltip="邮件联系我" data-position="top" data-delay="50">
        <i class="fas fa-envelope-open"></i>
    </a>













    <a href="/atom.xml" class="tooltipped" target="_blank" data-tooltip="RSS 订阅" data-position="top" data-delay="50">
        <i class="fas fa-rss"></i>
    </a>

</div>
    </div>
</footer>

<div class="progress-bar"></div>


    <!-- 搜索遮罩框 -->
<div id="searchModal" class="modal">
    <div class="modal-content">
        <div class="search-header">
            <span class="title"><i class="fas fa-search"></i>&nbsp;&nbsp;搜索</span>
            <input type="search" id="searchInput" name="s" placeholder="请输入搜索的关键字"
                   class="search-input">
        </div>
        <div id="searchResult"></div>
    </div>
</div>

<script src="/js/search.js"></script>
<script type="text/javascript">
// Once the DOM is ready, hook the search modal up to the site index:
// queries typed into #searchInput are answered into #searchResult.
// `searchFunc` is presumably provided by /js/search.js -- TODO confirm.
$(() => {
    const indexUrl = "/" + "search.xml";
    searchFunc(indexUrl, 'searchInput', 'searchResult');
});
</script>
    <!-- 回到顶部按钮 -->
<div id="backTop" class="top-scroll">
    <a class="btn-floating btn-large waves-effect waves-light" href="#!">
        <i class="fas fa-arrow-up"></i>
    </a>
</div>


    <script src="/libs/materialize/materialize.min.js"></script>
    <script src="/libs/masonry/masonry.pkgd.min.js"></script>
    <script src="/libs/aos/aos.js"></script>
    <script src="/libs/scrollprogress/scrollProgress.min.js"></script>
    <script src="/libs/lightGallery/js/lightgallery-all.min.js"></script>
    <script src="/js/matery.js"></script>

    <!-- Global site tag (gtag.js) - Google Analytics -->


    <!-- Baidu Analytics -->

    <!-- Baidu Push -->

<script>
    // Inject the Baidu link-submit ("push") script ahead of the first <script>
    // element of the document, choosing the script URL by the page's protocol.
    (function () {
        var pushScript = document.createElement('script');
        var scheme = window.location.protocol.split(':')[0];
        pushScript.src = scheme === 'https'
            ? 'https://zz.bdstatic.com/linksubmit/push.js'
            : 'http://push.zhanzhang.baidu.com/push.js';
        var firstScript = document.getElementsByTagName("script")[0];
        firstScript.parentNode.insertBefore(pushScript, firstScript);
    })();
</script>

    
    <script src="/libs/others/clicklove.js" async="async"></script>
    
    
    <script async src="/libs/others/busuanzi.pure.mini.js"></script>
    

    

    

    

    

    
    
    
    <script src="/libs/instantpage/instantpage.js" type="module"></script>
    

</body>

</html>
